{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#在已经给的两个代码中，分别查看了各个数据的情况，并把train和test中出现的user和event数据进行了抽取，\n",
    "#只需要处理这一部分的数据就可以，抽取的数据存放为train_data.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#第一步导入需要使用到的工具包\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.cluster import MiniBatchKMeans\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn import metrics\n",
    "\n",
    "from sklearn.decomposition import PCA\n",
    "import time\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>event_id</th>\n",
       "      <th>user_id</th>\n",
       "      <th>start_time</th>\n",
       "      <th>city</th>\n",
       "      <th>state</th>\n",
       "      <th>zip</th>\n",
       "      <th>country</th>\n",
       "      <th>lat</th>\n",
       "      <th>lng</th>\n",
       "      <th>...</th>\n",
       "      <th>c_92</th>\n",
       "      <th>c_93</th>\n",
       "      <th>c_94</th>\n",
       "      <th>c_95</th>\n",
       "      <th>c_96</th>\n",
       "      <th>c_97</th>\n",
       "      <th>c_98</th>\n",
       "      <th>c_99</th>\n",
       "      <th>c_100</th>\n",
       "      <th>c_other</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>684921758</td>\n",
       "      <td>3647864012</td>\n",
       "      <td>2012-10-31T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>244999119</td>\n",
       "      <td>3476440521</td>\n",
       "      <td>2012-11-03T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>3928440935</td>\n",
       "      <td>517514445</td>\n",
       "      <td>2012-11-05T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>2582345152</td>\n",
       "      <td>781585781</td>\n",
       "      <td>2012-10-30T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>1051165850</td>\n",
       "      <td>1016098580</td>\n",
       "      <td>2012-09-27T00:00:00.001Z</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 111 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0    event_id     user_id                start_time city state  \\\n",
       "0           0   684921758  3647864012  2012-10-31T00:00:00.001Z  NaN   NaN   \n",
       "1           1   244999119  3476440521  2012-11-03T00:00:00.001Z  NaN   NaN   \n",
       "2           2  3928440935   517514445  2012-11-05T00:00:00.001Z  NaN   NaN   \n",
       "3           3  2582345152   781585781  2012-10-30T00:00:00.001Z  NaN   NaN   \n",
       "4           4  1051165850  1016098580  2012-09-27T00:00:00.001Z  NaN   NaN   \n",
       "\n",
       "   zip country  lat  lng   ...     c_92  c_93  c_94  c_95  c_96  c_97  c_98  \\\n",
       "0  NaN     NaN  NaN  NaN   ...        0     1     0     0     0     0     0   \n",
       "1  NaN     NaN  NaN  NaN   ...        0     0     0     0     0     0     0   \n",
       "2  NaN     NaN  NaN  NaN   ...        0     0     0     0     0     0     0   \n",
       "3  NaN     NaN  NaN  NaN   ...        0     0     0     0     0     0     0   \n",
       "4  NaN     NaN  NaN  NaN   ...        0     0     0     0     0     0     0   \n",
       "\n",
       "   c_99  c_100  c_other  \n",
       "0     0      0        9  \n",
       "1     0      0        7  \n",
       "2     0      0       12  \n",
       "3     0      0        8  \n",
       "4     0      0        9  \n",
       "\n",
       "[5 rows x 111 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#把刚才抽取的训练集和测试集中出现过的活动列表给读取进来。并打印出来看一下\n",
    "train = pd.read_csv('train_data.csv')\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the shape of train_image: (13418, 102)\n"
     ]
    }
   ],
   "source": [
    "#设置训练数据，打印出来数据的信息。\n",
    "X_train = train.drop(['event_id','user_id','start_time','city','state','zip','country','lat','lng'],axis=1)\n",
    "print('the shape of train_image: {}'.format(X_train.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(13418, 75)\n"
     ]
    }
   ],
   "source": [
    "#对训练数据进行PCA降维，并打印降维后的特征数。\n",
    "pca = PCA(n_components=75)\n",
    "pca.fit(X_train)\n",
    "\n",
    "X_train_pca = pca.transform(X_train)\n",
    "\n",
    "print(X_train_pca.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "#进行聚类，首先是以一个参数点的模型，在校验集上面评价聚类算法的性能，使用MinBatchKMeans\n",
    "#可以更快的获得计算结果，剑圣计算时间，结果的质量会降低（在实践中，质量差异可能相当小）\n",
    "def K_cluster_analysis(K, X_train):\n",
    "    start = time.time()\n",
    "    \n",
    "    print(\"K-means begin with clusters: {}\".format(K));\n",
    "    \n",
    "    mb_kmeans = MiniBatchKMeans(n_clusters = K)\n",
    "    mb_kmeans.fit(X_train)\n",
    "\n",
    "    CH_score = metrics.silhouette_score(X_train,mb_kmeans.predict(X_train))\n",
    "    \n",
    "    \n",
    "    end = time.time()\n",
    "    print(\"CH_score: {}, time elaps:{}\".format(CH_score, int(end-start)))\n",
    "    \n",
    "    return CH_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 5\n",
      "CH_score: 0.5557358801980513, time elaps:5\n",
      "K-means begin with clusters: 10\n",
      "CH_score: 0.5252317712881518, time elaps:4\n",
      "K-means begin with clusters: 20\n",
      "CH_score: 0.49543353836494786, time elaps:3\n",
      "K-means begin with clusters: 30\n",
      "CH_score: 0.46994192167273113, time elaps:3\n",
      "K-means begin with clusters: 40\n",
      "CH_score: 0.4471222021886594, time elaps:3\n",
      "K-means begin with clusters: 50\n",
      "CH_score: 0.4219861392167631, time elaps:3\n",
      "K-means begin with clusters: 60\n",
      "CH_score: 0.41179933559426357, time elaps:3\n",
      "K-means begin with clusters: 70\n",
      "CH_score: 0.39067002223541103, time elaps:3\n",
      "K-means begin with clusters: 80\n",
      "CH_score: 0.38379070160293327, time elaps:3\n",
      "K-means begin with clusters: 90\n",
      "CH_score: 0.36749372871411023, time elaps:3\n",
      "K-means begin with clusters: 100\n",
      "CH_score: 0.3606651948969119, time elaps:4\n"
     ]
    }
   ],
   "source": [
    "# 调整K值，找到最佳的参数。\n",
    "Ks = [5, 10, 20, 30,40,50,60,70,80,90,100]\n",
    "CH_scores = []\n",
    "v_scores = []\n",
    "for K in Ks:\n",
    "    ch = K_cluster_analysis(K, X_train)\n",
    "    CH_scores.append(ch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0x24dd1371978>]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAD8CAYAAAB3u9PLAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3XuclnP+x/HXp0kjVA6Nw9bYiiwjdLhV/JTFxjiVjaxYZJFF2l2HdVgtxmHp4bRWe6gQFmkVpWVbrNZhle5IShtTolHLIGoXJX1+f3yv0S1T992crvvwfj4e92Pu63tf1zWfy4x5d13f6/p+zd0RERFpFncBIiKSHRQIIiICKBBERCSiQBAREUCBICIiEQWCiIgACgQREYkoEEREBFAgiIhIpHncBWyOtm3beocOHeIuQ0Qkp8yePftDdy9Jt15OBUKHDh1IJpNxlyEiklPM7J1M1tMlIxERARQIIiISUSCIiAigQBARkYgCQUREAAWCiIhEFAgiIgIUSCA8+iiMHh13FSIi2a0gAuGBB+Dii+Gjj+KuREQkexVEIFRUwH//CyNHxl2JiEj2yigQzKzczBaaWaWZXVbL50PMrNrM5kSvs1I++yqlfUpKe0czm2lmb5nZw2bWomEO6dvKyuCUU+B3v4Plyxvru4iI5La0gWBmRcAo4EigDBhsZmW1rPqwu3eNXmNT2j9Pae+f0n4TcJu7dwZWAGfW/TDSu/pqWLMGbrihMb+LiEjuyuQMoSdQ6e6L3X0NMB4YUJ9vamYGHAo8EjXdCxxXn32ms9tu8JOfwJ/+BO9kNMyTiEhhySQQ2gFLU5arorYNHW9mc83sETMrTWnf0sySZjbDzGr+6O8AfOLua9Pss0GNGAFmcO21jf2dRERyTyaBYLW0+QbLjwMd3H1f4GnCv/hr7OruCeBk4HYz2y3DfYZvbjY0CpRkdXV1BuVuXGkpnHsujBsHb71Vr12JiOSdTAKhCkj9F397YFnqCu7+kbuvjhbHAD1SPlsWfV0MTAe6AR8C25pZzXwM39pnyvaj3T3h7omSkrTzO6R1+eVQXBz6FEREZL1MAmEW0Dm6K6gFcBIwJXUFM9slZbE/sCBq387MiqP3bYH/A95wdweeBU6ItjkdmFyfA8nUTjvB8OHw0EMwb15TfEcRkdyQNhCi6/zDgGmEP/QT3H2+mVWYWc1dQ8PNbL6ZvQYMB4ZE7XsByaj9WeBGd38j+uxS4EIzqyT0KdzVUAeVziWXQKtWoU9BREQCC/9Yzw2JRMIbagrNigq46iqYNQsSiQbZpYhIVjKz2VFf7iYVxJPKtfn5z2GHHeDKK+OuREQkOxRsILRuDZddBtOmwfPPx12NiEj8CjYQAM47D3beGX71K8ihK2ciIo2ioANhq63CJaPnn4ennoq7GhGReBV0IACcdRZ897s6SxARKfhAKC6GX/8akkmY3CRPQoiIZKeCDwSA006DPfYIzyWsWxd3NSIi8VAgAM2bwzXXhCeXH3447mpEROKhQIiceCLss094WG3t2vTri4jkGwVCpFmzMCz2W2/BfffFXY2ISNNTIKTo3x/23z9cPlq9Ov36IiL5RIGQwgyuuw7efRfGjIm7GhGRpqVA2EC/ftC3L1x/PXz2WdzViIg0HQXCBmrOEv7zHxg1Ku5qRESajgKhFn36wBFHwI03wsqVcVcjItI0FAgbcd118PHHcPvtcVciItI0FAgbkUjAD38It9wSgkFEJN9lFAhmVm5mC82s0swuq+XzIWZWbWZzotdZUXtXM3spml5zrpn9KGWbcWb2dso2XRvusBpGRQWsWgUjR8ZdiYhI40sbCGZWBIwCjgTKgMFmVlbLqg+7e9foNTZq+ww4zd33BsqB281s25RtLknZZk79DqXhdekCgwfDHXeETmYRkXyWyRlCT6DS3Re7+xpgPDAgk527+5vu/lb0fhnwAVBS12LjcPXVsGZN+Coiks8yCYR2wNKU5aqobUPHR5eFHjGz0g0/NLOeQAtgUUrz9dE2t5lZcW3f3MyGmlnSzJLV1dUZlNuwOneGYcPgT3+CK67QnAkikr8yCQSrpW3DP4uPAx3cfV/gaeDeb+zAbBfgfuAMd68ZYPpyYE9gf2B74NLavrm7j3b3hLsnSkriObm45RYYOhR+8xu44AINkS0i+al5ButUAan/4m8PLEtdwd0/SlkcA9xUs2BmrYG/Ale6+4yUbZZHb1eb2T3AxZtXetMpKoI//hFat4abbw4dzXfdFYbNFhHJF5n8SZsFdDazjsB7wEnAyakrmNkuKX/g+wMLovYWwKPAfe7+l9q2MTMDjgPm1etIGplZuNuoTZswkc6qVfDQQ2HGNRGRfJA2ENx9rZkNA6YBRcDd7j7fzCqApLtPAYabWX9gLfAxMCTa/ESgL7CDmdW0DYnuKHrAzEoIl6TmAD9tuMNqHGZw5ZXhTOFnP4Njj4VHH4Wtt467MhGR+jPPoV7SRCLhyWQy7jIAGDcOzjwTDjgApk6FbbdNu4mISCzMbLa7J9KtpyeV62jIkDDd5ssvwyGHwAcfxF2RiEj9KBDq4YQTYMoUWLgwDJldVRV3RSIidadAqKfycpg2DZYtg4MOgsrKuCsSEakbBUID6NMHnn0W/vvf8H5eVt8vJSJSOwVCA+nRA557LtyJdPDBMGtW3BWJiGweBUIDKiuDF14IzyoceihMnx53RSIimVMgNLBOneD556G0FI48Ep54Iu6KREQyo0BoBO3ahctHe+8NAwaE21NFRLKdAqGRtG0LzzwDvXuHORXGjk2/jYhInBQIjahNm3BL6hFHwNlnw223xV2RiMjGKRAa2VZbweTJ4SG2Cy8ME+3k0GghIlJANIBzE2jRIoyMus02cM01sHJlmGPBaptpQkQkJgqEJtK8eZhDoXXrcOlo5cowC1tRUdyViYgECoQm1KwZ3H576Fu49towp8L994czCBGRuCkQmpgZVFSEM4VLLgmhMHEitGwZd2UiUujUqRyTiy8Ol4z+9rfwANvKlXFXJCKFLqNAMLNyM1toZpVmdlktnw8xs2ozmxO9zkr57HQzeyt6nZ7S3sPMXo/2eUc0lWZBGToUHngAXnwRDjsMPvoo/TYiIo0lbSCYWREwCjgSKAMGm1lZLas+7O5do9fYaNvtgauAXkBP4Coz2y5a/w/AUKBz9Cqv78HkosGDYdIkeP31MCje8uXptxERaQyZnCH0BCrdfbG7rwHGAwMy3P8RwFPu/rG7rwCeAsrNbBegtbu/5GEOz/uA4+pQf1449lh48klYsiTMqfD223FXJCKFKJNAaAcsTVmuito2dLyZzTWzR8ysNM227aL36fZZMA45JAx1sWJFmFPh3/+OuyIRKTSZBEJt1/Y3fNb2caCDu+8LPA3cm2bbTPYZdmA21MySZpasrq7OoNzc1atXGDJ77doQCq+8EndFIlJIMgmEKqA0Zbk9sCx1BXf/yN1XR4tjgB5ptq2K3m90nyn7Hu3uCXdPlJSUZFBubtt33zB89lZbhbOGF16IuyIRKRSZBMIsoLOZdTSzFsBJwJTUFaI+gRr9gQXR+2nA4Wa2XdSZfDgwzd2XA6vMrHd0d9FpwOR6Hkve6Nw5hMLOO8Phh8Pf/x53RSJSCNIGgruvBYYR/rgvACa4+3wzqzCz/tFqw81svpm9BgwHhkTbfgxcSwiVWUBF1AZwLjAWqAQWAU822FHlgV13DXMq7LFH6HSeODHuikQk35nn0NCbiUTCk8lk3GU0qRUr4KijYObM8ITzFVeEITBERDJlZrPdPZFuPf1pyXLbbQf/+AecfDKMGAGDBoXhLkREGpoCIQe0bBkGwbvlFnjsMTjgAFi0KO6qRCTfKBByhFmYYGfaNFi2DPbfX53NItKwFAg55gc/gGQS2rcPg+LdfLNmYBORhqFAyEGdOsG//gUDB4YhtH/8Y/jss7irEpFcp0DIUdtsAxMmwPXXh+k5DzoI3nkn7qpEJJcpEHKYWbgN9fHHQydzIhGGvhARqQsFQh44+mh4+WXYYYfQx3DnnepXEJHNp0DIE9/7Xnh47aij4IIL4Kyz4Isv4q5KRHKJAiGPtGkTnlMYMQLuvhu+//1wi6qISCYUCHmmWbMwxMXEiTBvHvToAS+9FHdVIpILFAh5auBAmDEjDKN98MEwdmzcFYlItlMg5LEuXWDWrDCvwtlnw/nnw5o1cVclItlKgZDntt8enngiPMD2+9+Hu5Defz/uqkQkGykQCkBREYwcCQ8+GIa9SCRg9uy4qxKRbKNAKCCDB8OLL4aO54MOgj//Oe6KRCSbZBQIZlZuZgvNrNLMLtvEeieYmZtZIlo+xczmpLzWmVnX6LPp0T5rPtuxYQ5JNqVbt3CW0Ls3nHoqXHQRrF0bd1Uikg3SBoKZFQGjgCOBMmCwmZXVsl4rwvSZM2va3P0Bd+/q7l2BU4El7j4nZbNTaj539w/qeSySoZKSMHT28OFw661QXg4ffRR3VSISt0zOEHoCle6+2N3XAOOBAbWsdy0wEtjY87GDgYfqVKU0uC22gN/+NjzA9vzzYX6FuXPjrkpE4pRJILQDlqYsV0VtXzOzbkCpu0/dxH5+xLcD4Z7octEIM7NMCpaGdcYZ8NxzsHp1mIntL3+JuyIRiUsmgVDbH+qvh04zs2bAbcBFG92BWS/gM3efl9J8irvvA/SJXqduZNuhZpY0s2R1dXUG5crm6tUr9Cvstx+ceCL86lfw1VdxVyUiTS2TQKgCSlOW2wOpI+S0AroA081sCdAbmFLTsRw5iQ3ODtz9vejrKuBBwqWpb3H30e6ecPdESUlJBuVKXeyyCzz7bHiA7YYboH9/+OSTuKsSkaaUSSDMAjqbWUcza0H44z6l5kN3/9Td27p7B3fvAMwA+rt7Er4+gxhE6HsgamtuZm2j91sAxwCpZw8Sg+JiGD0a/vjH0OncqxcsWBB3VSLSVNIGgruvBYYB04AFwAR3n29mFWbWP4Pv0ReocvfFKW3FwDQzmwvMAd4Dxmx29dIozjknnC188kkIhSlT0m8jIrnPPIdmUkkkEp5MJuMuo2AsXRoGyUsm4eqr4corw1PPIpJbzGy2uyfSracnlWWjSkvDHUinnRYC4bDDQkiISH5SIMgmtWwJ48bBvfeG8Y/22w8mTYq7KhFpDAoEScssnCW8+irsthscf3zoZ/jss7grE5GGpECQjO2+exgc79JLYcyYMGrqa6/FXZWINBQFgmyWFi3gxhvhqafCXUg9e8Idd0AO3ZsgIhuhQJA6OeywcHZwxBHws5/BMcfABxqeUCSnKRCkzkpKYPJkuPNOeOYZ2Hff8ECbiOQmBYLUi1mYq3nWLGjbNpwxXHKJ5m4WyUUKBGkQ++wTQuHcc+Hmm8PIqW++GXdVIrI5FAjSYFq2hN//Hh57DJYsge7d4Z571OEskisUCNLgBgwIk+307Ak/+UmYy1kjp4pkPwWCNIp27cKtqTfcAI88Al27hmcYRCR7KRCk0RQVweWXhyAoKoK+faGiQpPviGQrBYI0ul69wrAXJ58MV10FhxwC774bd1UisiEFgjSJ1q3h/vvD69VXwyB5jzwSd1UikkqBIE3qxz+GOXNgjz1g0KAwZef//hd3VSICCgSJwW67wQsvhP6Fu+6CHj3CWYOIxCujQDCzcjNbaGaVZnbZJtY7wczczBLRcgcz+9zM5kSvP6as28PMXo/2eYeZWf0PR3LFFluEO5CefhpWrYLeveG222DdurgrEylcaQPBzIqAUcCRQBkw2MzKalmvFTAcmLnBR4vcvWv0+mlK+x+AoUDn6FVet0OQXHbooWGQvPJyuPBCOPpoeP/9uKsSKUyZnCH0BCrdfbG7rwHGAwNqWe9aYCTwRbodmtkuQGt3f8nDpM73AcdlXrbkk7Ztw9PNo0bB9OlhkLxp0+KuSqTwZBII7YDUmXSroravmVk3oNTdp9ayfUcze9XM/mlmfVL2WbWpfabse6iZJc0sWV1dnUG5kovM4LzzwnhIO+4YzhguughWr467MpHCkUkg1HZt/+vRacysGXAbcFEt6y0HdnX3bsCFwINm1jrdPr/R6D7a3RPunigpKcmgXMllXbrAyy+HEVRvvTUMkrdwYdxViRSGTAKhCihNWW4PLEtZbgV0Aaab2RKgNzDFzBLuvtrdPwJw99nAImCPaJ/tN7FPKWAtW4Y5FiZPDg+wde8Od98dd1Ui+S+TQJgFdDazjmbWAjgJmFLzobt/6u5t3b2Du3cAZgD93T1pZiVRpzRm1onQebzY3ZcDq8ysd3R30WnA5IY9NMl1/fuHQfJ694Yzz4ShQ3UJSaQxpQ0Ed18LDAOmAQuACe4+38wqzKx/ms37AnPN7DXgEeCn7v5x9Nm5wFigknDm8GQdj0Hy2He+E2Zhu+IKGDMGDj4Y3nsv7qpE8pN5Dg1Wn0gkPJlMxl2GxGTSJDj9dNh6a/jLX6BPn/TbiAiY2Wx3T6RbT08qS84YOBBmzgzjIh16aOhnyKF/z4hkPQWC5JSysnBr6pFHwgUXwJAh8PnncVclkh8UCJJz2rQJD7JdfTXcdx8cdBC8807cVYnkPgWC5KRmzcLcClOmQGUlJBLwj3/EXZVIblMgSE479thwCamkBPr1Cw+zqV9BpG4UCJLz9tgjdDYfd1wY7uLkkzXHgkhdKBAkL7RqFWZgu+EGePhhOPBAWLw47qpEcosCQfKGWZh054knYOnS0K+gUVNFMqdAkLxTXg7JJJSWhttTf/Mb9SuIZEKBIHmpUyf417/gRz8Kw14MGhRmZhORjVMgSN7aemt48EG45RZ49NEwSN6bb8ZdlUj2UiBIXjMLU3M+9RR88AHsvz9MrW0aJxFRIEhhOPTQ0K+w++7h2YVrroF16+KuSiS7KBCkYHz3u/DCC3DaaWHYi+OOg08/jbsqkeyhQJCC0rIljBsHv/sdPPkk9OwJb7wRd1Ui2UGBIAXHDIYNC2MfffIJ9OoV5loQKXQZBYKZlZvZQjOrNLPLNrHeCWbmZpaIlvuZ2Wwzez36emjKutOjfc6JXjvW/3BEMtenD7zyCuy9Nxx/fLg99auv4q5KJD5pAyGaE3kUcCRQBgw2s7Ja1msFDAdmpjR/CBzr7vsApwP3b7DZKe7eNXp9UMdjEKmzdu3gn/+Es88OD7AdfTR8/HH67UTyUSZnCD2BSndf7O5rgPHAgFrWuxYYCXxR0+Dur7r7smhxPrClmRXXs2aRBlVcDKNHw5/+FC4j7b8/zJ0bd1UiTS+TQGgHLE1ZroravmZm3YBSd9/UHd7HA6+6++qUtnuiy0UjzMwyLVqkMQwdGs4WvvgCDjgAxo+PuyKRppVJINT2h/rrkWHMrBlwG3DRRndgtjdwE3BOSvMp0aWkPtHr1I1sO9TMkmaWrK6uzqBckbo74ACYPRu6d4fBg+Hii2Ht2rirEmkamQRCFVCastweWJay3AroAkw3syVAb2BKSsdye+BR4DR3X1Szkbu/F31dBTxIuDT1Le4+2t0T7p4oKSnJ9LhE6mznneGZZ+D888OwF0ccAR9+GHdVIo0vk0CYBXQ2s45m1gI4CZhS86G7f+rubd29g7t3AGYA/d09aWbbAn8FLnf3F2u2MbPmZtY2er8FcAwwr8GOSqSeWrSAO+8Mzyy8+CLsuSecc04YTnvNmrirE2kcaQPB3dcCw4BpwAJggrvPN7MKM+ufZvNhwO7AiA1uLy0GppnZXGAO8B4wpj4HItIYTj8dXnoJfvCDMFBeeTnstFN42nnyZPj887grFGk45jk0UHwikfBkMhl3GVKgvvgiDJI3cSJMmQIrVoQRVY86CgYODLestmoVd5Ui32Zms909kXY9BYLI5vvyS5g+PTzh/Oij8P774fbVfv3CQ27HHgs77BB3lSKBAkGkiXz1VbisNHFiCIh334WiIjjkkHDm8MMfho5qkbgoEERi4B5uW500KQTEm2+GsZMOPDCcOfzwh9ChQ9xVSqFRIIjEzD2MpFoTDq+9Ftq7dw/hMHBguHtJpLEpEESyzKJFIRwmTYIZM0JbWVkIhuOPh/32C2cTIg1NgSCSxaqq4LHHwpnDc8+F2ds6dQrhMHBgGJK7mQanlwaiQBDJEdXV4ZmGSZPg6afDHUzf+U7obzj++DBMd/PmcVcpuSzTQNC/QURiVlICZ50FTzwRwuHPf4beveHuu8Nc0LvsEib00bDc0tgUCCJZpE0bOOWUcCmpujp8PfzwMDT33nuHB+JEGosCQSRLbb116E944AGYNQt23BEGDIBTT9XZgjQOBYJIDujaNYTCr38d5mno0gUefzzuqiTfKBBEckSLFnDNNfDyy9C2LfTvHwbfW7Ei7sokXygQRHJMt26QTMKIEeFyUpcu8Ne/xl2V5AMFgkgOatECKipg5kzYfns45hgYMgQ++STuyiSXKRBEcliPHuFs4Ve/Crer7r13uH1VpC4UCCI5rrgYrrsuDIex3XZhXoYzztDZgmy+jALBzMrNbKGZVZrZZZtY7wQz85r5lKO2y6PtFprZEZu7TxHJTCIRRlq94gq4//7Qt/Dkk3FXJbkkbSCYWREwCjgSKAMGm1lZLeu1AoYDM1PayghzMO8NlAO/N7OiTPcpIpunuBiuvz6cLbRpE2ZzO/NM+PTTuCuTXJDJGUJPoNLdF7v7GmA8MKCW9a4FRgJfpLQNAMa7+2p3fxuojPaX6T5FpA4SCXjlFbj8chg3LpwtTJsWd1WS7TIJhHbA0pTlqqjta2bWDSh196kZbpt2nyJSP8XFcMMNYTa3Vq2gvDyMmaSzBdmYTAKhthHavx4i1cyaAbcBF23Gtpvc5zd2YDbUzJJmlqyurs6gXBFJ1bNnOFu49FK45x6dLcjGZRIIVUBpynJ7YFnKciugCzDdzJYAvYEpUcfyxrZNt8+vuftod0+4e6KkpCSDckVkQ1tuCTfeCP/6F2yzTThbOPtsWLky7sokm2QSCLOAzmbW0cxaEDqJvx5z0d0/dfe27t7B3TsAM4D+7p6M1jvJzIrNrCPQGXg53T5FpHH06gWvvgq//GUYXrtLF/j73+OuSrJF2kBw97XAMGAasACY4O7zzazCzPqn2XY+MAF4A/gbcL67f7WxfdbvUEQkE1tuCTfdBC++GEZUPeIIGDpUZwuiGdNECtrnn8NVV8Ett0D79jB2LPTrF3dV0tA0Y5qIpNWyJYwcCS+8EM4cDj8czjkHVq2KuzKJgwJBRDjgAJgzBy66CMaMCX0LTz8dd1XS1BQIIgKEs4Wbb15/ttCvH5x7rs4WCokCQUS+4cADw9nChReGuZz32SfMt7B6ddyVSWNTIIjIt7RsGTqan38+zL1wzDHQunW4tPSLX4RpPJcsgRy6J0UyoLuMRGSTPv88zLEwY0Z4zZ4d2gB22ik829C7d3glEmGYDMkumd5lpEAQkc3y5Zfw+ushHGbODF/ffDN81qxZmKSnJiB69YK99grtEh8Fgog0mY8+gpdfXh8QM2eun6CndeswnlJNQPTqBRqFpmkpEEQkNuvWwVtvrb/MNHMmzJ0LX30VPt9tt29eatpvv9BXIY1DgSAiWeV//wv9DzVnES+9BMuXh8+Ki6F7929eatp1V7DaxkWWzaZAEJGs5g5VVesDoqbD+otoiq2dd14fDgcfHN4rIOom00Bo3hTFiIhsyAxKS8PrhBNC25dfhktLqR3Wjz0WPuvTB667Dvr2ja/mfKe+fxHJGltsAT16wPnnw333hbuXPvwQ7rwTKivDmUK/fiEopOEpEEQkq+2wQwiIRYvCw3Jz5oQH5I49NsztIA1HgSAiOaFlyzCcxttvw/XXhzGXuneHQYPgjTfiri4/KBBEJKdssw1ccUUIhhEj4G9/C6OznnpquKwkdadAEJGctO22UFERguHii2HiRNhzzzBX9Lvvxl1dbsooEMys3MwWmlmlmV1Wy+c/NbPXzWyOmb1gZmVR+ylRW81rnZl1jT6bHu2z5rMdG/bQRKQQtG0bJvlZtAjOOy90RnfuDBdcsP45B8lM2ucQzKwIeBPoB1QBs4DB7v5Gyjqt3X1l9L4/cJ67l2+wn32Aye7eKVqeDlzs7hk/WKDnEEQknXffDben3nMPNG8Ow4bBL39Z2MNlNOQUmj2BSndf7O5rgPHAgNQVasIgsjVQW8oMBh7K4PuJiNTZrrvC6NHw73/DiSfCrbdCp06hv6FmfCWpXSaB0A5YmrJcFbV9g5mdb2aLgJHA8Fr28yO+HQj3RJeLRpjV/gyimQ01s6SZJaurqzMoV0QkjJd0770wbx4cdVQ4a+jYMdyhpFngapdJINT2h/pbZwDuPsrddwMuBa78xg7MegGfufu8lOZT3H0foE/0OrW2b+7uo9094e6JkkI+5xOROtlrL3j44fDMQt++cOWV4Yzhllvgs8/iri67ZBIIVUBpynJ7YNkm1h8PHLdB20lscHbg7u9FX1cBDxIuTYmINIquXWHy5DAkRvfu4c6k3XeHUaM0PWiNTAJhFtDZzDqaWQvCH/cpqSuYWeeUxaOBt1I+awYMIgRFTVtzM2sbvd8COAZIPXsQEWkUPXvCtGnwz3+GQBg2DPbYA+66K4ylVMjSBoK7rwWGAdOABcAEd59vZhXRHUUAw8xsvpnNAS4ETk/ZRV+gyt0Xp7QVA9PMbC4wB3gPGFP/wxERyUzfviEU/v73MLLqWWeFy0t//vP6eRsKjYa/FpGC5w5Tp4Y7kV57LQRDRQUMHJgf03825G2nIiJ5zSwMlvfKKzBhQgiIQYPCyKtTp4blQqBAEBGJNGsWgmDevPDE88qVISgSCbj88jA3Qz4//axLRiIiG/HllzBuHIwZE25bXbs2tJeWrp/NrVevcNfSVlvFWuomaQpNEZEG9PnnIRRmzlz/WrIkfFZUBPvttz4gevcO4yllS/+DAkFEpJG9//76cJgxA2bNWv8U9LbbhltcawKiZ88wEF8cFAgiIk3sq6/CGEo1c0LPnBn6I9atC5/vtts3LzV17QotWjR+XQoEEZEs8N//QjL5zTOJmo7p4mLo1u2bl5o6dAh3PTUkBYKISBZyh6qqbwbE7NmhjwLCMN2pAbH//tCmTf2+Z6aB0Lx+30ZERDZyEQLeAAAEIUlEQVSHWbhLqbQUTjghtH35Zbi0lHqpaerU9evvuWeYEW6vvRq3NgWCiEjMttgiXDrq1g3OPTe0rVgROqlrAqLdtyYdaHgKBBGRLLTddnD44eHVVLLkLlkREYmbAkFERAAFgoiIRBQIIiICKBBERCSiQBAREUCBICIiEQWCiIgAOTaWkZlVA+/EXUdM2gIfxl1EjHT8On4df919191L0q2UU4FQyMwsmcngVPlKx6/j1/E3/vHrkpGIiAAKBBERiSgQcsfouAuImY6/sOn4m4D6EEREBNAZgoiIRBQIWcjMSs3sWTNbYGbzzexnUfv2ZvaUmb0Vfd0u7lobi5kVmdmrZjY1Wu5oZjOjY3/YzJpgavL4mNm2ZvaImf07+j04oMB+/r+IfvfnmdlDZrZlPv8OmNndZvaBmc1Laav1523BHWZWaWZzzax7Q9WhQMhOa4GL3H0voDdwvpmVAZcBz7h7Z+CZaDlf/QxYkLJ8E3BbdOwrgDNjqarp/Bb4m7vvCexH+G9RED9/M2sHDAcS7t4FKAJOIr9/B8YB5Ru0beznfSTQOXoNBf7QUEUoELKQuy9391ei96sIfwzaAQOAe6PV7gWOi6fCxmVm7YGjgbHRsgGHAo9Eq+TtsQOYWWugL3AXgLuvcfdPKJCff6Q50NLMmgNbAcvJ498Bd38O+HiD5o39vAcA93kwA9jWzHZpiDoUCFnOzDoA3YCZwE7uvhxCaAA7xldZo7od+CWwLlreAfjE3ddGy1WEgMxXnYBq4J7ostlYM9uaAvn5u/t7wM3Au4Qg+BSYTWH9DsDGf97tgKUp6zXYfwsFQhYzs22AicDP3X1l3PU0BTM7BvjA3WenNteyaj7fHtcc6A78wd27Af8jTy8P1Sa6Vj4A6Ah8B9iacJlkQ/n8O7Apjfb/gwIhS5nZFoQweMDdJ0XN79ecGkZfP4irvkb0f0B/M1sCjCdcJridcFrcPFqnPbAsnvKaRBVQ5e4zo+VHCAFRCD9/gB8Ab7t7tbt/CUwCDqSwfgdg4z/vKqA0Zb0G+2+hQMhC0TXzu4AF7n5rykdTgNOj96cDk5u6tsbm7pe7e3t370DoSPyHu58CPAucEK2Wl8dew93/Ayw1s+9FTYcBb1AAP//Iu0BvM9sq+n+h5vgL5ncgsrGf9xTgtOhuo97ApzWXlupLD6ZlITM7CHgeeJ3119GvIPQjTAB2JfxPM8jdN+yIyhtm9n3gYnc/xsw6Ec4YtgdeBX7s7qvjrK8xmVlXQqd6C2AxcAbhH3AF8fM3s2uAHxHuuHsVOItwnTwvfwfM7CHg+4RRTd8HrgIeo5afdxSSdxLuSvoMOMPdkw1ShwJBRERAl4xERCSiQBAREUCBICIiEQWCiIgACgQREYkoEEREBFAgiIhIRIEgIiIA/D9eRUZmTBn5SgAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 绘制不同聚类数目的模型的性能，找到最佳模型／参数（分数最高）\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "plt.plot(Ks, np.array(CH_scores), 'b-')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "通过图片发现，K值越小，分数越高，所以可以考虑使用最小的K值。但是要防止把所有的特征聚为一类\n",
    "所以不能选取的太小"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
